#imports
import datetime
import requests
import warnings
import matplotlib
import matplotlib.dates as mdates
import seaborn as sns
import plotly.offline as py
import plotly_express as px
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima_model import ARIMA
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, add_changepoints_to_plot
from itertools import cycle, islice
from IPython.display import Image
warnings.filterwarnings('ignore')
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import folium
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bar_chart_race as bcr
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
import seaborn as sns
%matplotlib inline
import math
import random
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')
# Colour palette shared by the plots below.
# `cnf` was '#39e46' (five hex digits), which is not a valid colour string;
# '#393e46' is presumably the intended value — TODO confirm.
cnf = '#393e46'
dth = '#ff2e63'  # used for death series
rec = '#21bf73'  # used for recovered series
act = '#fe9801'  # used for active series
# Pie chart of the gender split among the individual case records.
# NOTE(review): `individual_details` is not created anywhere in the visible
# part of this file — confirm it is loaded before this runs.
# NOTE(review): value_counts() orders by frequency, so the 'Male'/'Female'
# labels only line up if the male category is the most frequent one — verify.
gender_counts = list(individual_details['gender'].value_counts())
labels = ['Missing', 'Male', 'Female']
sizes = [
    individual_details['gender'].isnull().sum(),
    gender_counts[0],
    gender_counts[1],
]
explode = (0, 0.1, 0)  # pull the second slice out slightly
colors = ['#ffcc99', '#66b3ff', '#ff9999']

plt.figure(figsize=(15, 10))
plt.title('Percentage of Gender', fontsize=20)
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.axis('equal')
plt.tight_layout()
# DATA PREPARATION
import os
import shutil

import plotly as py

# Switch Plotly to notebook (inline) rendering.
py.offline.init_notebook_mode(connected=True)

# Remove any stale copy of the preprocessed dataset directory.
# The original ran os.system("rm -rf ...") inside a bare try/except, but
# os.system reports failure through its return code and does not raise, so
# the handler could never fire.  shutil.rmtree with ignore_errors=True keeps
# the "delete if present, otherwise do nothing" behaviour without a shell.
shutil.rmtree("Covid-19-Preprocessed-Dataset", ignore_errors=True)
# Load the cleaned case-level table and the three pre-aggregated tables.
df = pd.read_csv('covid_19_data_cleaned.csv', parse_dates=['Date'])
country_daywise = pd.read_csv('country_daywise.csv', parse_dates=['Date'])
countrywise = pd.read_csv('countrywise.csv')
daywise = pd.read_csv('daywise.csv', parse_dates=['Date'])
df.head()

# Missing provinces become empty strings.
df['Province/State'] = df['Province/State'].fillna("")
df.head()

# Integer-encode province and country; `le` is re-fitted for each column.
province_codes = le.fit_transform(df['Province/State'])
df['ProvinceID'] = province_codes
country_codes = le.fit_transform(df['Country'])
df['CountryID'] = country_codes
df.head()
# Correlation heat-map of the numeric columns.
# The frame also holds text and date columns; pandas >= 2.0 no longer drops
# those silently inside DataFrame.corr() and raises instead, so the numeric
# (and boolean) columns are selected explicitly first.
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True)
# There is no strong correlation between any of the variables except between Confirmed and Deaths (0.76)
# Square canvas for the per-country pie chart.
fig = plt.figure(figsize=(10, 10))
# Summed 'Confirmed' per country, largest first.
conf_per_country = (
    df.groupby('Country')['Confirmed']
    .sum()
    .sort_values(ascending=False)
)
# Sum of 'Confirmed' over every row.
conf_sum = df['Confirmed'].sum()
def absolute_value(val):
    """Return ``val`` rounded to two decimal places.

    Passed as the ``autopct`` callback of the pie chart that follows.
    """
    return np.round(val, 2)
# One pie slice per country, each annotated with its rounded share.
conf_per_country.plot(
    kind="pie",
    title='Percentage of confirmed cases per country',
    autopct=absolute_value,
)
plt.show()
# Five countries with the highest per-country maximum of 'Confirmed'.
case_columns = ['Confirmed', 'Recovered', 'Deaths', 'Country']
group_cases = (
    df[case_columns]
    .groupby('Country')
    .max()
    .sort_values('Confirmed', ascending=False)
    .head()
)
group_cases = group_cases.reset_index()
group_cases
# Among the top 5 countries with maximum confirmed cases Russia is doing better followed by India if we consider both the recovery and the death rate
recovery_share = group_cases['Recovered'] / group_cases['Confirmed']
death_share = group_cases['Deaths'] / group_cases['Confirmed']
group_cases['Recovery Rate'] = round(recovery_share, 2)
group_cases['Death Rate'] = round(death_share, 2)
group_cases = group_cases.sort_values(by='Confirmed', ascending=False)
group_cases.style.background_gradient(cmap='Greens')
country_daywise.head()
countrywise.head()
daywise.head()

# Worldwide totals per date for each case type.
# The column is selected before aggregating, so only that column is summed;
# the original summed every column of `df` three times and then discarded
# all but one.  The resulting two-column frames are the same.
confirmed = df.groupby('Date')['Confirmed'].sum().reset_index()
recovered = df.groupby('Date')['Recovered'].sum().reset_index()
deaths = df.groupby('Date')['Deaths'].sum().reset_index()
deaths.head()

# Quick sanity checks on the case-level table.
df.isnull().sum()
df.info()
df.query('Country=="US"')
confirmed.tail()
recovered.tail()
deaths.tail()
# Worldwide curves: one trace per case type, all drawn against the dates of
# the `confirmed` frame.
fig = go.Figure()
for values, trace_name, colour in (
    (confirmed['Confirmed'], 'Confirmed', "Orange"),
    (recovered['Recovered'], 'Recovered', "Green"),
    (deaths['Deaths'], 'Deaths', "Red"),
):
    fig.add_trace(
        go.Scatter(
            x=confirmed['Date'],
            y=values,
            mode='lines+markers',
            name=trace_name,
            line=dict(color=colour, width=4),
        )
    )
fig.update_layout(
    title='Worldwide COVID19 Cases',
    xaxis_tickfont_size=14,
    yaxis=dict(title='Number of cases'),
)
fig.show()
# CASES DENSITY
df.info()
# From this point on df['Date'] holds strings rather than timestamps.
df['Date'] = df['Date'].astype(str)
df.info()
df.head()

# Animated density map of case locations, one frame per 'Date' value.
fig = px.density_mapbox(
    df,
    lat='Lat',
    lon='Long',
    hover_name='Country',
    hover_data=['Confirmed', 'Recovered', 'Deaths'],
    animation_frame='Date',
    color_continuous_scale='Portland',
    radius=7,
    zoom=0,
    height=700,
)
fig.update_layout(
    title='Worldwide COVID-19 Cases with Time laps',
    mapbox_style='open-street-map',
    mapbox_center_lon=0,
)
fig.show()
# Treemap of the latest date's worldwide Active / Deaths / Recovered totals.
# Columns are selected with a list: selecting several columns from a groupby
# with a bare tuple was deprecated in pandas 1.0 and raises in pandas 2.x.
temp = df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == max(temp['Date'])].reset_index(drop=True)
tm = temp.melt(id_vars='Date', value_vars=['Active', 'Deaths', 'Recovered'])
# NOTE(review): the colour list is ordered active/recovered/deaths while the
# melted rows are Active/Deaths/Recovered — confirm each tile gets the
# intended colour.
fig = px.treemap(tm, path=['variable'], values='value', height=250, width=800,
                 color_discrete_sequence=[act, rec, dth])
fig.data[0].textinfo = 'label+text+value'
fig.show()
# Stacked area chart of Recovered / Deaths / Active totals per date.
# A list of column names is used after groupby; the bare-tuple form was
# deprecated in pandas 1.0 and raises in pandas 2.x.
temp = df.groupby('Date')[['Recovered', 'Deaths', 'Active']].sum().reset_index()
temp = temp.melt(id_vars='Date', value_vars=['Recovered', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
fig = px.area(temp, x='Date', y='Count', color='Case', height=600,
              title='Cases over time', color_discrete_sequence=[rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
# World map with one circle per location for the most recent date in `df`.
temp = df[df['Date'] == max(df['Date'])]
m = folium.Map(location=[0, 0], tiles='cartodbpositron',
               min_zoom=1, max_zoom=4, zoom_start=1)
# Iterate over the rows directly instead of re-running temp.iloc[i] for
# every field of every row.
for _, row in temp.iterrows():
    folium.Circle(
        location=[row['Lat'], row['Long']],
        color='crimson',
        # `fill` is an on/off switch; the original passed the colour string
        # here, which only enabled filling because the string is truthy.
        fill=True,
        tooltip='<li><bold> Country:' + str(row['Country']) +
                '<li><bold> Province:' + str(row['Province/State']) +
                '<li><bold> Confirmed:' + str(row['Confirmed']) +
                '<li><bold> Deaths:' + str(row['Deaths']),
        # Radius grows with the square root of the confirmed count.
        radius=int(row['Confirmed']) ** 0.5,
    ).add_to(m)
m
# Animated world map of confirmed cases, one frame per calendar day.
frame_labels = country_daywise['Date'].dt.strftime('%Y-%m-%d')
fig = px.choropleth(
    country_daywise,
    locations='Country',
    locationmode='country names',
    color=country_daywise['Confirmed'],
    hover_name='Country',
    animation_frame=frame_labels,
    title='Cases over time',
    color_continuous_scale=px.colors.sequential.Inferno,
)
fig.update(layout_coloraxis_showscale=True)
fig.show()
# Confirmed and death counts per date as two side-by-side bar panels.
fig_c = px.bar(daywise, x='Date', y='Confirmed', color_discrete_sequence=[act])
fig_d = px.bar(daywise, x='Date', y='Deaths', color_discrete_sequence=[dth])
fig = make_subplots(
    rows=1,
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.1,
    subplot_titles=('Confirmed Cases', 'Death Cases'),
)
for col_no, panel in enumerate((fig_c, fig_d), start=1):
    fig.add_trace(panel['data'][0], row=1, col=col_no)
fig.update_layout(height=500)
fig.show()
daywise.columns
# Three ratio columns of `daywise`, each drawn as a line in its own panel.
fig1 = px.line(daywise, x='Date', y='Deaths / 100 Cases', color_discrete_sequence=[dth])
fig2 = px.line(daywise, x='Date', y='Recovered / 100 Cases', color_discrete_sequence=[rec])
fig3 = px.line(daywise, x='Date', y='Deaths / 100 Recovered', color_discrete_sequence=['aqua'])
fig = make_subplots(
    rows=1,
    cols=3,
    shared_xaxes=False,
    subplot_titles=('Deaths / 100 Cases', 'Recovered / 100 Cases', 'Deaths / 100 Recovered'),
)
for col_no, line_fig in enumerate((fig1, fig2, fig3), start=1):
    fig.add_trace(line_fig['data'][0], row=1, col=col_no)
fig.update_layout(height=400)
fig.show()
# Side-by-side world maps coloured by log(Confirmed) and log(Deaths).
# Rows with a zero count are dropped before taking the logarithm, because
# log(0) is -inf and cannot be placed on the colour scale.  The deaths map
# already filtered this way; the confirmed map now does the same.
temp = countrywise[countrywise['Confirmed'] > 0]
fig_c = px.choropleth(temp, locations='Country', locationmode='country names',
                      color=np.log(temp['Confirmed']), hover_name='Country',
                      hover_data=['Confirmed'])
temp = countrywise[countrywise['Deaths'] > 0]
fig_d = px.choropleth(temp, locations='Country', locationmode='country names',
                      color=np.log(temp['Deaths']), hover_name='Country',
                      hover_data=['Deaths'])
fig = make_subplots(rows=1, cols=2, subplot_titles=['Confirmed', 'Deaths'],
                    specs=[[{'type': 'choropleth'}, {'type': 'choropleth'}]])
fig.add_trace(fig_c['data'][0], row=1, col=1)
fig.add_trace(fig_d['data'][0], row=1, col=2)
fig.update(layout_coloraxis_showscale=False)
fig.update_layout(height=1000, width=1000)
fig.show()
# Two bar panels: the 'Confirmed' column and the 'No. of Countries' column
# of `daywise`, both against date.
# NOTE(review): the first panel is titled 'No. of new Cases per day' but
# plots the 'Confirmed' column — confirm which column was intended.
fig_c = px.bar(daywise, x='Date', y='Confirmed', color_discrete_sequence=[act])
fig_d = px.bar(daywise, x='Date', y='No. of Countries', color_discrete_sequence=[dth])
fig = make_subplots(
    rows=1,
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.1,
    subplot_titles=('No. of new Cases per day', 'No. of Countries'),
)
for col_no, panel in enumerate((fig_c, fig_d), start=1):
    fig.add_trace(panel['data'][0], row=1, col=col_no)
fig.update_layout(height=400)
fig.show()
# Grid of horizontal bar charts: the `top` countries for each of ten metrics.
top = 15


def _top_bar(frame, column, colour):
    """Horizontal bar chart of the `top` rows of `frame` with the largest `column`."""
    return px.bar(frame.sort_values(column).tail(top), x=column, y='Country',
                  text=column, orientation='h', color_discrete_sequence=[colour])


fig_c = _top_bar(countrywise, 'Confirmed', act)
fig_d = _top_bar(countrywise, 'Deaths', dth)
fig_a = _top_bar(countrywise, 'Active', '#434343')
fig_r = _top_bar(countrywise, 'Recovered', rec)
fig_dc = _top_bar(countrywise, 'Deaths / 100 Cases', '#f84351')
fig_rc = _top_bar(countrywise, 'Recovered / 100 Cases', '#a45398')
fig_nc = _top_bar(countrywise, 'New Cases', '#f04341')
# Per-million ranking only considers countries with population above one million.
temp = countrywise[countrywise['Population'] > 1000000]
fig_p = _top_bar(temp, 'Cases / Million People', '#b40398')
fig_wc = _top_bar(countrywise, '1 week change', '#c04041')
# Percentage-increase ranking only considers countries with more than 100 confirmed cases.
temp = countrywise[countrywise['Confirmed'] > 100]
fig_wi = _top_bar(temp, '1 week % increase', '#b00398')

fig = make_subplots(
    rows=5,
    cols=2,
    shared_xaxes=False,
    horizontal_spacing=0.2,
    vertical_spacing=.05,
    subplot_titles=('Confirmed Cases', 'Deaths Reported', 'Recovered Cases', 'Active Cases',
                    'Deaths / 100 Cases', 'Recovered / 100 Cases', 'New Cases',
                    'Cases / Million People', '1 week change', '1 week % increase'),
)
# Panels fill the grid row by row, two per row.
panels = (fig_c, fig_d, fig_r, fig_a, fig_dc, fig_rc, fig_nc, fig_p, fig_wc, fig_wi)
for position, panel in enumerate(panels):
    fig.add_trace(panel['data'][0], row=position // 2 + 1, col=position % 2 + 1)
fig.update_layout(height=4000)
fig.show()
# Wikipedia Source
# COVID-19 totals come from `countrywise`; the other epidemics are literals.
covid_confirmed = countrywise['Confirmed'].sum()
covid_deaths = countrywise['Deaths'].sum()
epidemics = pd.DataFrame({
    'epidemic': ['COVID-19', 'SARS', 'EBOLA', 'MERS', 'H1N1'],
    'start_year': [2019, 2002, 2013, 2012, 2009],
    'end_year': [2020, 2004, 2016, 2020, 2010],
    'confirmed': [covid_confirmed, 8422, 28646, 2519, 6724149],
    'deaths': [covid_deaths, 813, 11323, 866, 19654],
})
# Deaths as a percentage of confirmed cases, to two decimals.
epidemics['mortality'] = round((epidemics['deaths'] / epidemics['confirmed']) * 100, 2)
epidemics.head()

# Long format: one row per (epidemic, measure) for the faceted bar chart.
temp = epidemics.melt(
    id_vars='epidemic',
    value_vars=['confirmed', 'deaths', 'mortality'],
    var_name='Case',
    value_name='Value',
)
fig = px.bar(
    temp,
    x='epidemic',
    y='Value',
    color='epidemic',
    text='Value',
    facet_col='Case',
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.update_traces(textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_yaxes(showticklabels=False)
# Unlink the y-axes of the second and third facets from the first.
fig.layout.yaxis2.update(matches=None)
fig.layout.yaxis3.update(matches=None)
fig.show()
# Bar-chart race of cases for a hand-picked set of countries.
data_corona = pd.read_csv("total_cases.csv")
data_corona.head()
# 'Pakistan' appeared twice in the original list, which selected that column
# twice and gave the race two identical bars; it is listed once here.
cols = ['date', 'Italy', 'Spain', 'Australia', 'Brazil', 'India', 'Colombia', 'Pakistan',
        'Mexico', 'Peru', 'United States', 'Russia', 'South Africa', 'Chile', 'Iran',
        'Argentina', 'United Kingdom', 'Saudi Arabia', 'Bangladesh',
        'Turkey', 'France', 'Germany']
# Index by date in one step rather than mutating a column selection in place.
Subsetdf = data_corona[cols].set_index("date")
bcr.bar_chart_race(df=Subsetdf, filename=None, figsize=(5, 4.5), title='COVID-19 Cases by Country')
# Bar-chart race of cases per Indian state / union territory.
data_corona = pd.read_csv("india_daywise.csv")
data_corona.head()
cols = [
    'date',
    'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh', 'Assam',
    'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadar Nagar Haveli',
    'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir',
    'Jharkhand', 'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh',
    'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha',
    'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana',
    'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
]
# Keep only the selected columns, indexed by date.
Subsetdf = data_corona[cols].set_index("date")
Subsetdf.tail(10)
bcr.bar_chart_race(
    df=Subsetdf,
    filename=None,
    figsize=(5, 4.5),
    title='COVID-19 Cases Statewise in INDIA',
)
# Rows of `df` for India only.  An explicit copy is taken so the column
# assignment below writes to an independent frame instead of to a selection
# of `df` (which triggers pandas' SettingWithCopyWarning).
dp = df.query('Country=="India"').copy()
dp.head()
dp['Country'] = dp['Country'].fillna("")
dp.isnull().sum()
dp.head()
# India totals per date.  The column is selected before aggregating so only
# that column is summed; the resulting frames match the original
# groupby().sum()[column] form.
confirmed = dp.groupby('Date')['Confirmed'].sum().reset_index()
recovered = dp.groupby('Date')['Recovered'].sum().reset_index()
deaths = dp.groupby('Date')['Deaths'].sum().reset_index()

# One trace per case type, all drawn against the dates of `confirmed`.
fig = go.Figure()
fig.add_trace(go.Scatter(x=confirmed['Date'], y=confirmed['Confirmed'], mode='lines+markers',
                         name='Confirmed', line=dict(color="Orange", width=4)))
fig.add_trace(go.Scatter(x=confirmed['Date'], y=recovered['Recovered'], mode='lines+markers',
                         name='Recovered', line=dict(color="Green", width=4)))
fig.add_trace(go.Scatter(x=confirmed['Date'], y=deaths['Deaths'], mode='lines+markers',
                         name='Deaths', line=dict(color="Red", width=4)))
fig.update_layout(title='India COVID19 Cases', xaxis_tickfont_size=14,
                  yaxis=dict(title='Number of cases'))
fig.show()
# Treemap of India's Active / Deaths / Recovered totals on the latest date.
# Columns are selected with a list: the bare-tuple form after groupby was
# deprecated in pandas 1.0 and raises in pandas 2.x.
temp = dp.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()
temp = temp[temp['Date'] == max(temp['Date'])].reset_index(drop=True)
tm = temp.melt(id_vars='Date', value_vars=['Active', 'Deaths', 'Recovered'])
# NOTE(review): the colour list is ordered active/recovered/deaths while the
# melted rows are Active/Deaths/Recovered — confirm each tile gets the
# intended colour.
fig = px.treemap(tm, path=['variable'], values='value', height=250, width=800,
                 color_discrete_sequence=[act, rec, dth])
fig.data[0].textinfo = 'label+text+value'
fig.show()
# Donut chart of India's case counts per age group.
age_details = pd.read_csv('AgeGroupDetails.csv')
india_covid_19 = pd.read_csv('covid_19_india.csv')
labels = list(age_details['AgeGroup'])
sizes = list(age_details['TotalCases'])
# Every slice is pulled out by the same small offset.
explode = [0.05 for _ in labels]

plt.figure(figsize=(15, 10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=9, explode=explode)
# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.title('India - Age Group wise Distribution', fontsize=20)
plt.axis('equal')
plt.tight_layout()
# We could see that the age group <40 is the most affected which is against the trend which says elderly people are more at risk of being affected. Only 17% of people >60 are affected
# Load the forecasting train/test files and preview them.
train_data = pd.read_csv("train.csv")
display(train_data.head())
test_data = pd.read_csv("test.csv")
display(test_data.head())
# Totals per date across all regions.  The aggregation is named by string:
# recent pandas releases emit a FutureWarning when the np.sum callable is
# passed and recommend "sum" to keep the same result.
sum_df = pd.pivot_table(train_data, values=['ConfirmedCases', 'Fatalities'],
                        index=['Date'], aggfunc='sum')
display(sum_df.max())
Let's create some new features:
- Daily confirmed cases
- Daily fatalities
- Growth factor (ratio of a day's new cases to the previous day's new cases)
- Mortality rate (ratio of fatalities to confirmed cases)
# Derived per-day features.
# The cumulative counts are differenced *within* each region.  A plain
# shift(1) over the whole frame subtracts the last row of one region from the
# first row of the next wherever the region changes, which yields spurious
# (often negative) "new" counts at every region boundary.
# Province_State may be missing, so it is filled with '' when building the key.
# Assumes rows are in chronological order within each region — TODO confirm.
region_key = (
    train_data['Country_Region'].astype(str)
    + '|'
    + train_data['Province_State'].fillna('').astype(str)
)

# First row of each region has no predecessor -> 0 new cases / fatalities.
train_data['NewConfirmedCases'] = (
    train_data.groupby(region_key)['ConfirmedCases'].diff().fillna(0.0)
)
train_data['NewFatalities'] = (
    train_data.groupby(region_key)['Fatalities'].diff().fillna(0.0)
)

# Fatalities per confirmed case; 0/0 (no cases yet) is reported as 0.
train_data['MortalityRate'] = (
    train_data['Fatalities'] / train_data['ConfirmedCases']
).fillna(0.0)

# Ratio of today's new cases to the previous day's new cases in the same
# region; division by zero (inf) and undefined ratios (NaN) are reported as 0.
previous_new_cases = train_data.groupby(region_key)['NewConfirmedCases'].shift(1)
train_data['GrowthRate'] = (
    (train_data['NewConfirmedCases'] / previous_new_cases)
    .replace([-np.inf, np.inf], 0.0)
    .fillna(0.0)
)
display(train_data.head())
def getColumnInfo(df):
    """Summarise the region and date coverage of a data frame.

    Parameters
    ----------
    df : DataFrame with 'Province_State', 'Country_Region' and 'Date' columns.

    Returns
    -------
    tuple
        (number of distinct provinces, number of distinct countries,
        number of distinct dates, earliest date, latest date).
    """
    n_province = df['Province_State'].nunique()
    n_country = df['Country_Region'].nunique()
    n_days = df['Date'].nunique()
    # min/max rather than unique()[0] / unique()[-1]: the first and last
    # *rows* are only the earliest and latest dates if the frame happens to
    # be sorted by date, and indexing also fails on an empty frame.
    start_date = df['Date'].min()
    end_date = df['Date'].max()
    return n_province, n_country, n_days, start_date, end_date
# Row counts and region/date coverage of the train and test sets.
n_train = train_data.shape[0]
n_test = test_data.shape[0]
n_prov_train, n_count_train, n_train_days, start_date_train, end_date_train = getColumnInfo(train_data)
n_prov_test, n_count_test, n_test_days, start_date_test, end_date_test = getColumnInfo(test_data)

print('<==Train data==> \n # of Province_State: ' + str(n_prov_train), ', # of Country_Region:' + str(n_count_train),
      ', Time Period: ' + str(start_date_train) + ' to ' + str(end_date_train), '==> days:', str(n_train_days))
print("\n Countries with Province/State information: ",
      train_data[train_data['Province_State'].notna()]['Country_Region'].unique())
# The dates are wrapped in str() here as in the train print above, so the
# concatenation cannot fail if the date values are not plain strings.
print('\n <==Test data==> \n # of Province_State: ' + str(n_prov_test), ', # of Country_Region:' + str(n_count_test),
      ', Time Period: ' + str(start_date_test) + ' to ' + str(end_date_test), '==> days:', n_test_days)

# Test days that are not later than the last training day overlap with the
# training period.  The cut-off is read from the data instead of the
# previously hard-coded '2020-04-14'.
last_train_date = train_data['Date'].max()
df_test = test_data.loc[test_data.Date > last_train_date]
overlap_days = n_test_days - df_test.Date.nunique()
print('\n overlap days with training data: ', overlap_days, ', total days: ', n_train_days + n_test_days - overlap_days)
We need to make predictions for 43 days, 13 of which overlap with the training data; we will use those overlapping days to test our forecast model. Let's look at how many data records have entries greater than zero.
# Relative frequency of each distinct value (kept for inspection).
prob_confirm_check_train = train_data.ConfirmedCases.value_counts(normalize=True)
prob_fatal_check_train = train_data.Fatalities.value_counts(normalize=True)

# Records with a non-zero count.  The rows are tested directly: the original
# value_counts()[1:] dropped the *first* entry of the frequency table, which
# is the zero bucket only when zero happens to be the most frequent value.
has_confirmed = train_data.ConfirmedCases > 0
has_fatality = train_data.Fatalities > 0
n_confirm_train = has_confirmed.sum()
n_fatal_train = has_fatality.sum()

print('Percentage of confirmed case records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(
    n_confirm_train, n_train, has_confirmed.mean() * 100))
print('Percentage of fatality records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(
    n_fatal_train, n_train, has_fatality.mean() * 100))
# Per-date, per-country totals (provinces summed; last GrowthRate row kept).
train_data_by_country = train_data.groupby(['Date', 'Country_Region'], as_index=False).agg(
    {'ConfirmedCases': 'sum', 'Fatalities': 'sum', 'GrowthRate': 'last'})

# Countries with more than 100 confirmed cases on the last training date,
# largest first, indexed by country name.
max_train_date = train_data['Date'].max()
train_data_by_country_confirm = (
    train_data_by_country
    .query('(Date == @max_train_date) & (ConfirmedCases > 100)')
    .sort_values('ConfirmedCases', ascending=False)
    .set_index('Country_Region')
)
train_data_by_country_confirm.style.background_gradient(cmap='Reds').format(
    {'ConfirmedCases': "{:.0f}", 'GrowthRate': "{:.2f}"})

# Repeating colour cycle, one entry per row of the top 30.
discrete_col = list(islice(cycle(['orange', 'r', 'g', 'k', 'b', 'c', 'm']), None,
                           len(train_data_by_country_confirm.head(30))))
plt.rcParams.update({'font.size': 22})

# Only the two columns named in the legend are drawn.  The original plotted
# every numeric column, so GrowthRate appeared as a third bar series that the
# two-entry legend did not label.
top20 = train_data_by_country_confirm.head(20)
top20[['ConfirmedCases', 'Fatalities']].plot(figsize=(20, 15), kind='barh', color=discrete_col)
plt.legend(["Confirmed Cases", "Fatalities"])
plt.xlabel("Number of Covid-19 Affectees")
plt.title("First 20 Countries with Highest Confirmed Cases")

# Write each value next to its bar.
ylocs, ylabs = plt.yticks()
for i, v in enumerate(top20["ConfirmedCases"]):
    plt.text(v + 0.01, ylocs[i] - 0.25, str(int(v)), fontsize=12)
for i, v in enumerate(top20["Fatalities"]):
    if v > 0:  # skip the label when no fatalities are recorded
        plt.text(v + 0.01, ylocs[i] + 0.1, str(int(v)), fontsize=12)
Below are the plots of confirmed cases and fatalities for nations with more than 600 fatalities. In the global plots, the daily new confirmed cases and the daily deaths are overlaid as well.
import matplotlib.dates as dates
def reformat_time(reformat, ax):
    """Apply weekly date ticks and minimal styling to a time-series axis.

    Parameters
    ----------
    reformat : truthy to additionally replace the tick labels with every
        eighth date of the module-level ``train_data_by_date`` frame,
        formatted as 'YYYY-MM-DD' and rotated by 90 degrees.
    ax : the matplotlib axes to modify in place.
    """
    ax.xaxis.set_major_locator(dates.WeekdayLocator())
    ax.xaxis.set_major_formatter(dates.DateFormatter('%b %d'))
    if reformat:  # reformat again if you wish
        date_list = train_data_by_date.reset_index()["Date"].tolist()
        # strftime is called on each timestamp itself.  The original went
        # through `dt.datetime.strftime`, but no name `dt` is imported in
        # this file, so this branch raised NameError whenever it ran.
        x_ticks = [t.strftime('%Y-%m-%d') for t in date_list]
        # Keep every eighth label.
        x_ticks = [tick for i, tick in enumerate(x_ticks) if i % 8 == 0]
        ax.set_xticklabels(x_ticks, rotation=90)
    # cosmetics: dotted horizontal grid, no visible frame
    ax.yaxis.grid(linestyle='dotted')
    for side in ('right', 'top', 'left', 'bottom'):
        ax.spines[side].set_color('none')
# Parse dates, then aggregate all regions into one row per date.
train_data['Date'] = pd.to_datetime(train_data['Date'])
train_data_by_date = train_data.groupby(['Date'], as_index=True).agg(
    {'ConfirmedCases': 'sum', 'Fatalities': 'sum',
     'NewConfirmedCases': 'sum', 'NewFatalities': 'sum', 'MortalityRate': 'mean'})
# Clamp negative aggregates to zero.  The original wrote through the object
# returned by the private _get_numeric_data() accessor, which only changes
# `train_data_by_date` if that object is a writable view of it; clip() does
# not depend on view semantics.
train_data_by_date = train_data_by_date.clip(lower=0.0)
## ======= Sort by countries with fatalities > 600 ========
# Peak confirmed/fatality counts per country, keeping those above 600 deaths.
train_data_by_country_max = train_data.groupby(['Country_Region'], as_index=True).agg(
    {'ConfirmedCases': 'max', 'Fatalities': 'max'})
train_data_by_country_fatal = train_data_by_country_max[train_data_by_country_max['Fatalities'] > 600]
train_data_by_country_fatal = train_data_by_country_fatal.sort_values(
    by=['Fatalities'], ascending=False).reset_index()

# Restrict the training rows to those countries (inner join on the country
# name) and aggregate per date and country.
df_merge_by_country = pd.merge(train_data, train_data_by_country_fatal['Country_Region'],
                               on=['Country_Region'], how='inner')
df_max_fatality_country = df_merge_by_country.groupby(['Date', 'Country_Region'], as_index=False).agg(
    {'ConfirmedCases': 'sum',
     'Fatalities': 'sum',
     'NewConfirmedCases': 'sum',
     'NewFatalities': 'sum',
     'MortalityRate': 'mean'})

# Clamp negative values in the numeric columns to zero.  The columns are
# assigned back explicitly instead of writing through the private
# _get_numeric_data() accessor, which relies on it returning a writable view.
numeric_cols = df_max_fatality_country.select_dtypes(include='number').columns
df_max_fatality_country[numeric_cols] = df_max_fatality_country[numeric_cols].clip(lower=0.0)
df_max_fatality_country.set_index('Date', inplace=True)

countries = train_data_by_country_fatal['Country_Region'].unique()
plt.rcParams.update({'font.size': 16})
# Two figures: confirmed cases (global | national) and fatalities
# (global | national).  The first figure gets its own name so it is not lost
# when `fig` is bound to the second one.
fig_cases, (ax0, ax1) = plt.subplots(1, 2, figsize=(15, 8))
fig, (ax2, ax3) = plt.subplots(1, 2, figsize=(15, 8))

# Global curves: cumulative (solid) and daily (dotted).  `legend` is an
# on/off flag, so the display text is passed through `label`; the original
# passed the text as `legend`, which showed the raw column name instead.
# x_compat=True tells pandas not to use its own datetime tick format.
train_data_by_date.ConfirmedCases.plot(ax=ax0, x_compat=True, title='Confirmed Cases Globally',
                                       label='Confirmed Cases', legend=True, color=discrete_col)
train_data_by_date.NewConfirmedCases.plot(ax=ax0, x_compat=True, linestyle='dotted',
                                          label='New Confirmed Cases', legend=True, color=discrete_col)
reformat_time(0, ax0)
train_data_by_date.Fatalities.plot(ax=ax2, x_compat=True, title='Fatalities Globally',
                                   label='Fatalities', legend=True, color='r')
train_data_by_date.NewFatalities.plot(ax=ax2, x_compat=True, linestyle='dotted',
                                      label='Daily Deaths', legend=True, color='r')
reformat_time(0, ax2)

# One line per high-fatality country on the national panels.
for country in countries:
    df_fatality_by_country = df_max_fatality_country[df_max_fatality_country.Country_Region == country]
    df_fatality_by_country.ConfirmedCases.plot(ax=ax1, x_compat=True, title='Confirmed Cases Nationally')
    df_fatality_by_country.Fatalities.plot(ax=ax3, x_compat=True, title='Fatalities Nationally')
# Axis formatting is applied once after all lines are drawn rather than once
# per country.
reformat_time(0, ax1)
reformat_time(0, ax3)
# Legends sit outside the axes, to the right.
ax1.legend(countries, loc='center left', bbox_to_anchor=(1.0, 0.5))
ax3.legend(countries, loc='center left', bbox_to_anchor=(1.0, 0.5))
Confirmed Cases: The largest rise in new cases appears in the week of Feb 11-18, after which China reached its saturation point. A new sudden rise then appears after March 24th, when the total number of new cases worldwide exceeds the total number of affectees in China alone.
Deaths: Since March 11th, the death toll rises steeply due to the sharp increase in European countries, especially Italy, Spain, France and the UK, and now also in the US. The average mortality rates of these countries, shown below, can explain the peaks in the global mortality rate.
# Average mortality rate: global (left) and per high-fatality country (right).
# The original first called plt.figure(), which created an extra empty figure
# that was replaced by the subplots call on the next line.
fig, (ax4, ax5) = plt.subplots(1, 2, figsize=(20, 8))
# `legend` is an on/off flag; the display text goes through `label`.
# x_compat=True tells pandas not to use its own datetime tick format.
train_data_by_date.MortalityRate.plot(ax=ax4, x_compat=True, label='Mortality Rate',
                                      legend=True, color='r')
reformat_time(0, ax4)
for country in countries:
    df_fatality_by_country = df_max_fatality_country[df_max_fatality_country.Country_Region == country]
    df_fatality_by_country.MortalityRate.plot(ax=ax5, x_compat=True,
                                              title='Average Mortality Rate Nationally')
# Axis formatting is applied once after all country lines are drawn.
reformat_time(0, ax5)
ax5.legend(countries, loc='center left', bbox_to_anchor=(1.0, 0.5))